{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "05ecb9fc-7992-439c-a743-88d4d4a97343",
   "metadata": {},
   "source": [
    "# 1-2. Поступает запрос (разбираю худший случай), который надо преобразовать в правильный запрос и из нормального запроса надо извлечь ключевые запросы для поиска"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "52aee07a-f8a4-4187-91c2-3ef20c8c6c3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForCausalLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "acda5f06-9d1c-43da-b805-0a6a71d6af4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "class MyLLM:\n",
    "    def __init__(self, max_new_tokens = 512, do_sample = False, top_k = 50, top_p = 0.95, num_return_sequences = 1):\n",
    "        print(\"-\"*50)\n",
    "        print(\"Load tokenizer\")\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/deepseek-coder-7b-instruct-v1.5\", trust_remote_code=True)\n",
    "        print(\"-\"*50)\n",
    "        print(\"Load model\")\n",
    "        self.model = AutoModelForCausalLM.from_pretrained(\"deepseek-ai/deepseek-coder-7b-instruct-v1.5\", trust_remote_code=True).cuda()\n",
    "        print(\"-\"*50)\n",
    "        print(\"Finish load\")\n",
    "        \n",
    "        self.messages=[\n",
    "            # { 'role': 'user', 'content': \"write a quick sort algorithm in python.\"}\n",
    "            { 'role': 'user', 'content': \"Hello! How are you?\"}\n",
    "        ]\n",
    "        self.max_new_tokens = max_new_tokens\n",
    "        self.do_sample = do_sample\n",
    "        self.top_k = top_k\n",
    "        self.top_p = top_p\n",
    "        self.eos_token_id = self.tokenizer.eos_token_id\n",
    "        self.num_return_sequences = num_return_sequences\n",
    "        \n",
    "    def invoke(self, message=None):\n",
    "        self.messages = message if message is not None else self.messages\n",
    "        inputs = self.tokenizer.apply_chat_template(self.messages, add_generation_prompt=True, return_tensors=\"pt\").to(self.model.device)\n",
    "        \n",
    "        outputs = self.model.generate(\n",
    "            inputs, \n",
    "            max_new_tokens=self.max_new_tokens, \n",
    "            do_sample=self.do_sample, \n",
    "            top_k=self.top_k, \n",
    "            top_p=self.top_p, \n",
    "            num_return_sequences=self.num_return_sequences, \n",
    "            eos_token_id=self.eos_token_id\n",
    "        )\n",
    "        \n",
    "        answer = self.tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)\n",
    "        print(answer)\n",
    "\n",
    "        return answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2d8a5b0d-7327-47eb-83a5-ce2f75de77e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# llm = MyLLM()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2e3fa83e-a6af-4ec9-abad-1c9c6b85e704",
   "metadata": {},
   "outputs": [],
   "source": [
    "# answer = llm.invoke()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b9bac7dd-27e6-4d78-8a2a-9b678524ff16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8fbf062e-b4af-4c7c-b306-16c05d4779eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.embeddings import HuggingFaceEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0b17cd23-fe34-4598-90df-5312494a3345",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Query:\n",
    "    def __init__(self, user_query):\n",
    "        # self.llm_name = \"gpt-3.5-turbo\"\n",
    "        self.embeddings = HuggingFaceEmbeddings()\n",
    "        self.llm = MyLLM()\n",
    "        self.user_query = user_query\n",
    "        self.correct_query = None\n",
    "\n",
    "    def get_correct_query(self):\n",
    "        messages=[\n",
    "            { 'role': 'system', 'content': \"You are an expert at finding repositories on GitHub. A person who does not know how to make such requests has contacted you, he writes a request in simple words, and your task is to make a competent and relevant request. As a response, you need to give only the text of the request that you recommend. Answer only english language\"},\n",
    "            { 'role': 'user', 'content': \"User request: \" + self.user_query}\n",
    "        ]\n",
    "\n",
    "        answer = self.llm.invoke(messages)\n",
    "\n",
    "        return answer\n",
    "\n",
    "    def get_key_queries_from_correct_query(self):\n",
    "        self.correct_query = self.get_correct_query()\n",
    "\n",
    "        messages=[\n",
    "            { 'role': 'system', 'content': \"You are an expert at finding repositories on GitHub. You have been contacted by the person who made up the query, and your task is to use keywords from the user's query to write 5 short queries that the user can best search for. Use no more than 3 words in one search query. Be brief. Write requests separated by commas. Answer only english language\"},\n",
    "            { 'role': 'user', 'content': \"User request: \" + self.correct_query}\n",
    "        ]\n",
    "\n",
    "        answer = self.llm.invoke(messages)\n",
    "\n",
    "        return answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0980ddf1-6ef5-4ff6-8cb1-48ecc8cb3b50",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Предположим, что мне пришёл пользовательский запрос\n",
    "uq=\"Я хочу сделать вот такой красивый розовый сайт для продажи собачек разных парод и чтобы сайт был красивым и привлекательным, а так же сайт должен быть продающим\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "cc149b50-2d1d-4285-8bb6-cef13cd228f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--------------------------------------------------\n",
      "Load tokenizer\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--------------------------------------------------\n",
      "Load model\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0920e0f3c5b04ca68e357552ddfb1872",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--------------------------------------------------\n",
      "Finish load\n"
     ]
    }
   ],
   "source": [
    "query = Query(user_query=uq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0c1a6226-c3fd-4bd9-a96a-bac610893a7a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kama/venv/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:497: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.95` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
      "  warnings.warn(\n",
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:100015 for open-end generation.\n",
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:100015 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "To create a beautiful, appealing, and selling dog breed selling website, you would need a combination of web development skills, design expertise, and marketing knowledge. Here's a high-level request that you can use to find the appropriate repositories on GitHub:\n",
      "\n",
      "1. **Frontend Development**: You'll need a modern and responsive frontend framework to ensure your website looks good on all devices. Frameworks like React, Vue.js, or Angular can help you build interactive and dynamic user interfaces.\n",
      "\n",
      "2. **Design**: You'll need a good set of design assets, including images, logos, and color schemes. You might want to look for repositories that contain open-source design resources or templates that you can use as a starting point.\n",
      "\n",
      "3. **E-commerce Features**: If you want to sell products online, you'll need an e-commerce platform. WooCommerce, Magento, or Shopify are popular choices for WordPress, PHP, and JavaScript respectively.\n",
      "\n",
      "4. **SEO Optimization**: To make your website attractive to search engines, you'll need to implement SEO best practices. There are many open-source SEO tools available that can help you with this.\n",
      "\n",
      "5. **Marketing Tools**: To make your website more appealing and sell more dogs, you'll need marketing tools. These could include email marketing platforms, social media integration, and analytics tools.\n",
      "\n",
      "6. **Backend Development**: You'll need a backend to manage your data, including user accounts, product listings, and orders. You might use a CMS like WordPress or a custom backend built with Node.js, Django, or Ruby on Rails.\n",
      "\n",
      "Remember, creating a selling website involves many aspects, and you might need to hire professionals in different areas to get the job done. GitHub repositories can provide you with code snippets, design templates, and tools to help you build your website, but they won't do the entire job for you.\n",
      "\n",
      "1. \"React frontend development\",\n",
      "2. \"Open-source design resources\",\n",
      "3. \"WooCommerce e-commerce platform\",\n",
      "4. \"SEO optimization tools\",\n",
      "5. \"Marketing tools for website\",\n",
      "6. \"WordPress backend development\",\n",
      "7. \"Vue.js frontend development\",\n",
      "8. \"Magento e-commerce platform\",\n",
      "9. \"Shopify e-commerce platform\",\n",
      "10. \"Django backend development\",\n",
      "11. \"Ruby on Rails backend development\",\n",
      "12. \"SEO best practices\",\n",
      "13. \"Email marketing platforms\",\n",
      "14. \"Social media integration tools\",\n",
      "15. \"Analytics tools for website\".\n",
      "\n"
     ]
    }
   ],
   "source": [
    "key_queries = query.get_key_queries_from_correct_query().split(',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "190e4614-48bc-4e02-9ab7-764d5e54f1de",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['1. \"React frontend development\"',\n",
       " '\\n2. \"Open-source design resources\"',\n",
       " '\\n3. \"WooCommerce e-commerce platform\"',\n",
       " '\\n4. \"SEO optimization tools\"',\n",
       " '\\n5. \"Marketing tools for website\"',\n",
       " '\\n6. \"WordPress backend development\"',\n",
       " '\\n7. \"Vue.js frontend development\"',\n",
       " '\\n8. \"Magento e-commerce platform\"',\n",
       " '\\n9. \"Shopify e-commerce platform\"',\n",
       " '\\n10. \"Django backend development\"',\n",
       " '\\n11. \"Ruby on Rails backend development\"',\n",
       " '\\n12. \"SEO best practices\"',\n",
       " '\\n13. \"Email marketing platforms\"',\n",
       " '\\n14. \"Social media integration tools\"',\n",
       " '\\n15. \"Analytics tools for website\".\\n']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "key_queries"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7c8a1ce6-93c9-4fd4-b960-d8f95a6e15f3",
   "metadata": {},
   "source": [
    "# 3. Поступает список readme который надо преобразовать в embedding и добавить в общую базу данных, если их там нет, если данный readme уже есть в базе данных, то забираем его оттуда. По итогу этого пункта, формируется база данных документов с которыми будет работать LLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "0bb7a0ca-abac-4402-98fe-ebd37a425171",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fill_database(markdown_path):\n",
    "    text_splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=1500,\n",
    "        chunk_overlap=150\n",
    "    )\n",
    "\n",
    "    loader = UnstructuredMarkdownLoader(markdown_path)\n",
    "    doc = loader.load()\n",
    "    split_docs = text_splitter.split_documents(doc)\n",
    "\n",
    "    # Add the unique documents to your database\n",
    "    return Chroma.from_documents(split_docs, query.embeddings)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a34ebd13-e2fc-4abb-b896-f3bd7bb24fdf",
   "metadata": {},
   "source": [
    "# 4-6. Проверить соответствует ли каждый документ правильному запросу или же нет. Если соответствует, то с помощью семилярить получить численное значение на сколько оно соответствует, чтобы выплюнуть ранжированный список. Сделать человеко читаемую выжимку по этому readme на том же языке, что и поступил первичный запрос."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "cdbe4fc0-d1ab-4b8f-96c3-a79f7387fa2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Предположим, что ко мне пришёл список строк с путями к .md файлам\n",
    "paths = ['./README.md', './README1.md']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a079aaa-eb8f-4ae5-9e03-937777e1b4d4",
   "metadata": {},
   "source": [
    "Здесь score почему-то работает наоборот и чем ближе он к нлю, тем больше похожи друг на друга два документа. При возврате надо будет делать 1 - score."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "c08a934f-8ac1-46fd-800c-dc65274db6da",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.prompts import PromptTemplate\n",
    "\n",
    "# Build prompt\n",
    "template = \"\"\"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use only yes or no. Keep the answer as concise as possible. As a context, you will receive a repository \"readme\" file and you need to answer whether this repository corresponds to the question or not. \n",
    "{context}\n",
    "Question: Does the repository match the user's search query: {question}?\n",
    "Helpful Answer:\"\"\"\n",
    "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "8aa50ac2-aed1-40f4-a8cc-45f07e82b4c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "out_meta = []\n",
    "out_description = []\n",
    "out_raiting = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "c114c4c8-53f7-4eaf-9f4b-7337451e0770",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain_community.document_loaders import UnstructuredMarkdownLoader\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.chains import RetrievalQA\n",
    "from langchain_community.llms import GPT4All\n",
    "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "ab3b87fd-285f-46f9-a926-9b3cdf0d7e8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# local_path = (\"/home/user/models/Meta-Llama-3-8B.Q6_K.gguf\")\n",
    "local_path = (\"/home/kama/models/mistral-7b-instruct-v0.2.Q6_K.gguf\")\n",
    "callbacks = [StreamingStdOutCallbackHandler()]\n",
    "llm2 = GPT4All(model=local_path, callbacks=callbacks, verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "5d5719c4-75dd-4deb-9654-e1fc9536bfa5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kama/venv/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The function `__call__` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n",
      "  warn_deprecated(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " No, this repository does not match the user's search query as it focuses on LangChain, a language model framework, rather than creating a dog breed selling website. No, this repository does not match the user's search query as it focuses on predicting order cancellation in a food delivery system using machine learning algorithms rather than creating a dog breed selling website."
     ]
    }
   ],
   "source": [
    "for path in paths:\n",
    "    vectordb = fill_database(path)\n",
    "\n",
    "    # Run chain\n",
    "    qa_chain = RetrievalQA.from_chain_type(\n",
    "        llm2,\n",
    "        retriever=vectordb.as_retriever(),\n",
    "        return_source_documents=True,\n",
    "        chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT},\n",
    "    )\n",
    "\n",
    "    question = query.correct_query  # Этот запрос по факту нужен, тот что внизу написан для теста\n",
    "    # question = \"I need project with ml for predict delivery food\"\n",
    "\n",
    "    result = qa_chain({\"query\": question})\n",
    "    if result['result'] == \"Yes\":\n",
    "        out_meta.append(result['source_documents'][0].metadata['source'])  # save path from right document\n",
    "        chain = RetrievalQA.from_chain_type(\n",
    "            llm=llm2,\n",
    "            chain_type=\"map_reduce\",\n",
    "            retriever=vectordb.as_retriever()\n",
    "        )\n",
    "        q = \"What about this text? Explain it as for a person who does not have a technical education. Explain in simple words and to the point, but do not write too much - a maximum of 5 sentences.\"\n",
    "        result = chain({\"query\": q})\n",
    "        out_description.append(result['result'])\n",
    "        # забираем максимальное значение сходства для этого документа\n",
    "        out_raiting.append(1 - max([doc[1] for doc in [docs for docs in vectordb.similarity_search_with_score(question)]]))\n",
    "        continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "92fb207c-8ef0-4cfd-9767-ca8af5781acd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out_description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "9d29c549-7a86-4d73-8d94-80b98d940898",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out_meta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "9fbd5e7a-e58f-4c6d-8690-5f236be3be26",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out_raiting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c7dd1f6-e7a5-4bde-9a23-4ebd7a018576",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Kama (myenv)",
   "language": "python",
   "name": "venv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
